In [9]:
conda install pandas
Collecting package metadata (current_repodata.json): done
Solving environment: done


==> WARNING: A newer version of conda exists. <==
  current version: 4.10.3
  latest version: 23.3.1

Please update conda by running

    $ conda update -n base -c defaults conda



# All requested packages already installed.


Note: you may need to restart the kernel to use updated packages.
In [10]:
import pandas as pd
import os
In [11]:
operations = pd.read_csv("OperationsAssociateRawFile.csv")
In [12]:
print(operations)
       patient_id  provider_id insurance_type                       dob  \
0           46344           24           Zeta  1981-07-02T00:00:00.000Z   
1           46999           24     Beta Blues  1983-04-04T00:00:00.000Z   
2           47198           24     Beta Blues  1983-04-04T00:00:00.000Z   
3           48294           24     Beta Blues  1984-05-10T00:00:00.000Z   
4           49648           24     Beta Blues  1966-12-10T00:00:00.000Z   
...           ...          ...            ...                       ...   
65878      128707        13182   Alpha NorCal  1986-08-15T00:00:00.000Z   
65879      127456        13203   Alpha NorCal  2011-10-21T00:00:00.000Z   
65880      127638        13203   Alpha NorCal  2013-01-27T00:00:00.000Z   
65881      127976        13203   Alpha NorCal  1968-08-08T00:00:00.000Z   
65882      129117        13267           Cash  1998-06-03T00:00:00.000Z   

       gender              date_created booked_from state        utm_campaign  \
0        Male  2022-07-03T23:19:16.743Z        grow    FL            1.39E+11   
1      Female  2022-07-06T15:10:32.063Z        grow    FL                 NaN   
2      Female  2022-07-06T20:39:39.774Z        grow    FL            1.39E+11   
3      Female  2022-07-11T10:19:08.470Z        grow    FL            1.39E+11   
4      Female  2022-07-14T16:38:29.598Z        grow    FL                 NaN   
...       ...                       ...         ...   ...                 ...   
65878  Female  2022-12-30T04:25:55.105Z        grow    CA       35-email-prov   
65879  Female  2022-12-28T16:25:21.364Z        grow    CA  1-standard-profile   
65880  Female  2022-12-28T19:33:13.220Z        grow    CA  1-standard-profile   
65881    Male  2022-12-29T04:42:21.056Z        grow    CA                 NaN   
65882  Female  2022-12-30T23:00:45.425Z        grow    NY            1.32E+11   

      utm_source utm_medium  
0         google        cpc  
1           grow    organic  
2         google        cpc  
3         google        cpc  
4           grow    organic  
...          ...        ...  
65878      Alpha      email  
65879      Alpha       text  
65880      Alpha      email  
65881       grow    organic  
65882     google        cpc  

[65883 rows x 11 columns]
In [13]:
duplicates = operations.duplicated(subset=['patient_id'])
print('Number of duplicates in patient_id column:', duplicates.sum())
Number of duplicates in patient_id column: 0
In [14]:
unique_utm_medium = operations['utm_medium'].unique()
print(unique_utm_medium)
['cpc' 'organic' 'booking-link' 'api' 'text' 'email' 'call' 'profile-link'
 'switch' 'form' 'email for scheduling.' 'affliate' 'referral' 'Yext'
 "email'"]
In [15]:
count_utm_medium = operations['utm_medium'].value_counts()
print(count_utm_medium)
#api is used the most 
api                      21971
organic                  15911
email                     9596
cpc                       7590
text                      4916
call                      2480
booking-link              1719
switch                    1177
profile-link               499
affliate                    18
Yext                         2
form                         1
email for scheduling.        1
referral                     1
email'                       1
Name: utm_medium, dtype: int64
In [8]:
count_utm_medium = operations['utm_medium'].value_counts()
count_utm_medium.plot(kind='barh', color='skyblue')
plt.title('Count of UTM Medium')
plt.xlabel('Count')
plt.ylabel('UTM Medium')
plt.show()
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
/var/folders/bl/5kk4pcgd6vqdpfcs3h7lpqn80000gn/T/ipykernel_1283/465834050.py in <module>
      1 count_utm_medium = operations['utm_medium'].value_counts()
      2 count_utm_medium.plot(kind='barh', color='skyblue')
----> 3 plt.title('Count of UTM Medium')
      4 plt.xlabel('Count')
      5 plt.ylabel('UTM Medium')

NameError: name 'plt' is not defined
In [16]:
unique_utm_source = operations['utm_source'].unique()
print(unique_utm_source)
['google' 'grow' 'healthgrades' 'zocdoc' 'unk-source' 'grow-reclaim' 'mhm'
 'psychology-today' 'Zeta -directory' 'choosingtherapy' 'bing' 'pt'
 'office' 'therapy-for-black-girls' 'therapyden' 'facebook' 'caredash'
 'provider-sourced' 'GetWell-health-directory' 'Alpha' 'humana-directory'
 'aircall' 'optum' 'mental-health-match' 'zencare' 'c-directory'
 'bcbs-directory' 'tiktok' 'optum-directory' 'therapy-for-black-men'
 'therapy-tribe' 'Gamma-directory' 'latinx' 'univ-ill-bcbs' 'grow-rematch'
 'choosing-therapy' 'therapy-for-latinx' 'stackadapt' 'tfbm' 'gmb' 'yext'
 'sendgrid' 'sendinblue' 'choosing_therapy' 'talkiatry' 'extnet'
 'google-adc' 'good-therapy' 'hs_email' 'hs_automation']
In [17]:
# Get the count of each unique value in the 'utm_source' column
utm_source_counts = operations['utm_source'].value_counts()
print(utm_source_counts)

# Zocdoc has the most 
zocdoc                      22045
grow                        16134
Alpha                        9335
google                       6302
psychology-today             2299
pt                           1526
unk-source                   1333
office                       1259
grow-reclaim                  985
bing                          938
provider-sourced              855
therapy-for-black-girls       471
c-directory                   358
mhm                           274
facebook                      204
tiktok                        180
optum                         171
bcbs-directory                139
healthgrades                  134
GetWell-health-directory      126
therapy-for-black-men         114
zencare                       104
caredash                       94
aircall                        92
choosingtherapy                61
mental-health-match            56
Zeta -directory                55
Gamma-directory                53
hs_automation                  45
therapyden                     42
grow-rematch                   21
choosing_therapy               18
humana-directory               18
optum-directory                 9
univ-ill-bcbs                   6
sendgrid                        6
choosing-therapy                4
therapy-for-latinx              2
extnet                          2
good-therapy                    2
hs_email                        2
stackadapt                      1
tfbm                            1
latinx                          1
yext                            1
sendinblue                      1
therapy-tribe                   1
talkiatry                       1
google-adc                      1
gmb                             1
Name: utm_source, dtype: int64
In [18]:
import plotly.graph_objects as go
import matplotlib.pyplot as plt
!pip install squarify
import squarify
Requirement already satisfied: squarify in ./opt/anaconda3/lib/python3.9/site-packages (0.4.3)
In [19]:
# Set up data
utm_source_counts = operations['utm_source'].value_counts()

# Define color for bars
bar_color = '#b48c1f'

# Create bar plot
fig, ax = plt.subplots(figsize=(10,6))
plt.bar(utm_source_counts.index, utm_source_counts.values, color=bar_color)

# Rotate x-axis labels
plt.xticks(rotation=90)

# Set plot title and axis labels
plt.title('Number of Observations by UTM Source')
plt.xlabel('UTM Source')
plt.ylabel('Number of tracking of sources')

plt.show()
In [20]:
# Get a list of all unique values in the 'utm_campaign' column
utm_campaign_types = operations['utm_campaign'].unique()

# Print the list of unique values
print(utm_campaign_types)
['1.39E+11' nan '1-standard-profile' '12-zendesk' '5-abandoned-link-grow'
 '1.32E+15' '32-intercom' '1.32E+11' '1.36E+11' '1.33E+11' '1.35E+11'
 '1.42E+11'
 'f1039|grow-therapy|client|rmk|conv|all|a|---|ca|booking-page-visitors'
 '16-internal-referral' '1.33E+15' '6-abandoned-link-zocdoc' 'intercom'
 '1.44E+11' '1.31E+15' '33-grow-cancel'
 'f1038|grow-therapy|client|pst|conv|florida|a|18-45|lal|appointment-confirmation-0-120-0-1'
 'tik1005|grow-therapy|client|pst|conv|florida|f|18-34|vi|health-wellness-video-interactions'
 'tik1006|grow-therapy|client|pst|conv|florida|a|18-34|lal|clicks-6-sec-vid-view'
 'tik1002|grow-therapy|client|pst|conv|florida|f|18-34|int|self-care-interests'
 '26-grow-rematch' '1.38E+11' '29-abandoned-first-apt'
 'f1036|grow-therapy|client|rmk|conv|all|a|---|ca|site-visitors-started-booking-event'
 '35-email-prov'
 'tik1008|grow-therapy|client|pst|conv|florida|f|18-34|int-vi|self-care-interests-health-wellness-video-interactions'
 'client_share'
 'f1045|grow-therapy|client|pst|conv|florida|a|18-45|lal|appointment-confirmation-0-120-0-1'
 'f1042|grow-therapy|client|pst|conv|florida|f|18-24|dit|therapy-self-care-interests'
 'sa1005|grow-therapy|client|pst|display|conv|florida|all|---|pcai|therapy-counseling-pcai'
 'tik1020|grow-therapy|client|pst|conv|virginia|f|18-34|int-vi|self-care-interests-health-wellness-video-interactions'
 'f1041|grow-therapy|client|pst|conv|pennsylvania|a|18-45|lal|appointment-confirmation-0-120-0-1'
 'f1059|grow-therapy|client|pst|conv|tx-ga-pa-ct-md-in|a|18-45|lal|appointment-confirmation-0-120-0-1'
 'tik1017|grow-therapy|client|pst-test|conv|fl-tx|f|18-34|int-vi|interests-video-interactions-homepage-url-test'
 '13-gmb-trial'
 'f1040|grow-therapy|client|pst|conv|georgia|a|18-45|lal|appointment-confirmation-0-120-0-1'
 'tik1018|grow-therapy|client|pst-exp-aud-test|conv|fl-tx|f|18-34|int-vi|interests-video-interactions-control-audience'
 '1.43E+11'
 '1-standard-profile https://provider.growtherapy.com/book-appointment?id=974'
 '1.41E+11'
 'tik1010|grow-therapy|client|pst|conv|texas|f|18-34|int-vi|self-care-interests-health-wellness-video-interactions'
 'mham_beforeapril16'
 'tik1009|grow-therapy|client|pst|conv|pennsylvania|f|18-34|int-vi|self-care-interests-health-wellness-video-interactions'
 '1-standard-profile"'
 'f1048|grow-therapy|client|pst|conv|florida|f|18-24|dit|therapy-self-care-interests'
 'f1054|grow-therapy|client|pst|conv|texas|a|18-45|lal|appointment-confirmation-0-120-0-1'
 'tik1004|grow-therapy|client|pst|conv|pennsylvania|f|18-34|int|self-care-interests'
 'f1032|grow-therapy|client|rmk|conv|all|a|---|ca|booking-page-visitors'
 '1-standard-profilehttps://provider.growtherapy.com/book-appointment?id=1589'
 'Reschedule_Request_Individual_links_Grow_1' 'email_name'
 '24-talkiatry-test'
 'tik1022|grow-therapy|client|pst|conv|illinois|f|18-34|int-vi|self-care-interests-health-wellness-video-interactions'
 'mham_afterapril16'
 'tik1003|grow-therapy|client|pst|conv|georgia|f|18-34|int|self-care-interests'
 'tik1015|grow-therapy|client|rmk|conv|all|a|---|ca|clicks-6-sec-vid-view-site-visitors'
 '1-standard-'
 'tik1013|grow-therapy|client|pst|conv|maryland|f|18-34|int-vi|self-care-interests-health-wellness-video-interactions'
 'tik1014|grow-therapy|client|pst|conv|connecticut|f|18-34|int-vi|self-care-interests-health-wellness-video-interactions'
 '12-zendesk/'
 'tik1016|grow-therapy|client|pst-test|conv|fl-tx|f|18-34|int-vi|interests-video-interactions-filters-url-test'
 'tik1007|grow-therapy|client|rmk|conv|all|a|---|ca|clicks-6-sec-vid-view'
 'tik1019|grow-therapy|client|pst-exp-aud-test|conv|fl-tx|f|18-34|int-vi|interests-video-interactions-expanded-targeting-audience'
 'tik1011|grow-therapy|client|pst|conv|indiana|f|18-34|int-vi|self-care-interests-health-wellness-video-interactions'
 'f1033|grow-therapy|client|ret|conv|all|a|---|ca|appointment-confirmation-visitors'
 '1-standard-profilepsychoatrist that accept anthem'
 'f1052|grow-therapy|client|pst|conv|pennsylvania|f|18-24|dit|therapy-self-care-interests'
 'tik1021|grow-therapy|client|rmk|conv|all|a|---|ca|site-visitors-100%-views'
 'f1051|grow-therapy|client|pst|conv|georgia|f|18-24|dit|therapy-self-care-interests'
 '1.34E+11' '1.45E+11'
 'f1060|grow-therapy|client|rmk|conv|all|a|---|ca|booking-page-visitors'
 '1.48E+11']
In [21]:
# Get the count of each unique value in the 'utm_campaign' column
utm_campaign_count = operations['utm_campaign'].value_counts()

# Print the count of each unique value
print(utm_campaign_count)


# 1-stabndard profikle has the most 
1-standard-profile                                                                                                     32788
12-zendesk                                                                                                              6565
1.32E+11                                                                                                                1724
1.39E+11                                                                                                                1704
16-internal-referral                                                                                                     825
                                                                                                                       ...  
12-zendesk/                                                                                                                1
tik1013|grow-therapy|client|pst|conv|maryland|f|18-34|int-vi|self-care-interests-health-wellness-video-interactions        1
1-standard-                                                                                                                1
Reschedule_Request_Individual_links_Grow_1                                                                                 1
1.48E+11                                                                                                                   1
Name: utm_campaign, Length: 76, dtype: int64
In [22]:
# Convert the date_created column to a datetime data type
operations['date_created'] = pd.to_datetime(operations['date_created'])

# Group the data by month and count the number of unique patient IDs
monthly_patients = operations.groupby(pd.Grouper(key='date_created', freq='M'))['patient_id'].nunique()

# Print the resulting series
print(monthly_patients)

#number of patients that registered per month
date_created
2022-07-31 00:00:00+00:00     7627
2022-08-31 00:00:00+00:00    10028
2022-09-30 00:00:00+00:00    11054
2022-10-31 00:00:00+00:00    12049
2022-11-30 00:00:00+00:00    13028
2022-12-31 00:00:00+00:00    12097
Freq: M, Name: patient_id, dtype: int64
In [23]:
import plotly.express as px

# Convert the date_created column to a datetime data type
operations['date_created'] = pd.to_datetime(operations['date_created'])

# Group the data by month and count the number of unique patient IDs
monthly_patients = operations.groupby(pd.Grouper(key='date_created', freq='M'))['patient_id'].nunique().reset_index()
monthly_patients['month_year'] = monthly_patients['date_created'].dt.strftime('%Y-%b')

# Create a treemap figure
fig = px.treemap(monthly_patients, path=['month_year'], values='patient_id')

# Add the counts to the treemap labels
fig.data[0].textinfo = 'label+text+value'
fig.data[0].textfont.size = 25

# Show the figure
fig.show()
In [24]:
# Convert the date_created column to a datetime format
operations["date_created"] = pd.to_datetime(operations["date_created"])

# Extract the day of the week from the date_created column
operations["day_of_week"] = operations["date_created"].dt.day_name()

# Count the occurrences of each day of the week
day_counts = operations["day_of_week"].value_counts()

# Print the most popular day of the week
print("The most popular day of the week patients registered for is:", day_counts.index[0])
The most popular day of the week patients registered for is: Tuesday
In [25]:
import pandas as pd
import plotly.express as px

# Define an ordered list of weekdays
ordered_weekdays = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

# Sort the day_of_week column based on the ordered list
operations['day_of_week'] = pd.Categorical(operations['day_of_week'], categories=ordered_weekdays, ordered=True)

# Count the occurrences of each day of the week and sort the resulting Series
day_counts = operations['day_of_week'].value_counts().sort_index()

# Create a DataFrame with the day names and their counts
df = pd.DataFrame({'Day of Week': day_counts.index, 'Count': day_counts.values})

# Create a Bubble Chart with a Colorscale
fig = px.scatter(df, x='Day of Week', y='Count', size='Count', color='Count',
                 color_continuous_scale='reds')

# Set the chart title and axis labels
fig.update_layout(title='Patient Registrations by Day of Week',
                  xaxis_title='Day of Week',
                  yaxis_title='Number of Registrations')

# Show the chart
fig.show()
In [26]:
# group the patients by insurance type and count the number of unique patient ids
insurance_counts = operations.groupby('insurance_type')['patient_id'].nunique()

# calculate the total number of patients
total_patients = operations['patient_id'].nunique()

# calculate the percentage of patients for each insurance company
insurance_percentages = insurance_counts / total_patients * 100

# find the insurance company with the highest percentage of patients
highest_percentage = insurance_percentages.idxmax()

print("The insurance company with the highest percentage of patients is:", highest_percentage)
The insurance company with the highest percentage of patients is: Beta Blues
In [ ]:
import plotly.express as px

# group the patients by insurance type and count the number of unique patient ids
insurance_counts = operations.groupby('insurance_type')['patient_id'].nunique()

# calculate the total number of patients
total_patients = operations['patient_id'].nunique()

# calculate the percentage of patients for each insurance company
insurance_percentages = insurance_counts / total_patients * 100

# filter for insurance groups that have more than 1%
insurance_percentages_filtered = insurance_percentages[insurance_percentages > 1]

# create a DataFrame with the filtered percentages and insurance types
df = pd.DataFrame({'Insurance Type': insurance_percentages_filtered.index,
                   'Percentage': insurance_percentages_filtered.values})

# create a pie chart with a color gradient
fig = px.pie(df, values='Percentage', names='Insurance Type',
             color='Insurance Type', color_discrete_sequence=px.colors.qualitative.Safe)

# set the chart title
fig.update_layout(title='Percentage of Insurance Types')

# show the chart
fig.show()
In [27]:
lowest_insurance_company = operations.groupby('insurance_type')['patient_id'].nunique() / operations['patient_id'].nunique()
lowest_insurance_company = lowest_insurance_company.idxmin()
print("The insurance company with the lowest percentage of patients is:", lowest_insurance_company)
The insurance company with the lowest percentage of patients is: ANTHEM
In [28]:
states = operations['state'].unique()
print(states)
['FL' 'NH' 'TX' 'CA' 'SC' 'IN' 'NY' 'GA' 'OH' 'CT' 'PR' 'KY' 'IL' 'DE'
 'VA' 'NE' 'PA' 'VT' 'CO' 'AK' 'Florida' 'OR' nan 'WY' 'MA' 'WA' 'NC' 'MD'
 'WI' 'NJ' 'RI' 'TN' 'MI' 'AL' 'DC' 'HI' 'MN' 'ME' 'AZ' 'OK' 'LA' 'UT'
 'VI' 'MS' 'NM' 'KS' 'MO' 'NV' 'AR' 'MT' 'ID' 'IA' 'SD' 'WV']
In [29]:
florida_count = (operations['state'] == 'Florida').sum()
print("Number of times Florida appears in booked_fromstate column:", florida_count)
Number of times Florida appears in booked_fromstate column: 2
In [30]:
operations["state"] = operations["state"].replace("Florida", "FL")
In [31]:
states = operations['state'].unique()
print(states)
['FL' 'NH' 'TX' 'CA' 'SC' 'IN' 'NY' 'GA' 'OH' 'CT' 'PR' 'KY' 'IL' 'DE'
 'VA' 'NE' 'PA' 'VT' 'CO' 'AK' 'OR' nan 'WY' 'MA' 'WA' 'NC' 'MD' 'WI' 'NJ'
 'RI' 'TN' 'MI' 'AL' 'DC' 'HI' 'MN' 'ME' 'AZ' 'OK' 'LA' 'UT' 'VI' 'MS'
 'NM' 'KS' 'MO' 'NV' 'AR' 'MT' 'ID' 'IA' 'SD' 'WV']
In [32]:
num_unique_states = operations['state'].nunique()
print(f"There are {num_unique_states} unique states in the operations dataframe.")
There are 52 unique states in the operations dataframe.
In [33]:
# Check for duplicates in the states column
duplicates = operations['state'].duplicated()

# Print the results
if duplicates.any():
    print("There are duplicates in the states column")
else:
    print("There are no duplicates in the states column")
There are duplicates in the states column
In [34]:
# Group the dataframe by state and count the number of patients in each state
state_counts = operations.groupby('state')['patient_id'].count()

# Find the state with the highest number of patients
highest_state = state_counts.idxmax()
highest_count = state_counts.loc[highest_state]

# Find the state with the lowest number of patients
lowest_state = state_counts.idxmin()
lowest_count = state_counts.loc[lowest_state]

# Print the results
print(f"The state with the highest number of patients is {highest_state} with {highest_count} patients.")
print(f"The state with the lowest number of patients is {lowest_state} with {lowest_count} patients.")
The state with the highest number of patients is FL with 16506 patients.
The state with the lowest number of patients is VI with 2 patients.
In [35]:
import plotly.express as px

# Group the patients by state and count the number of patients in each state
state_counts = operations.groupby('state')['patient_id'].count().reset_index()

# Find the state with the highest number of patients
highest_state = state_counts.loc[state_counts['patient_id'].idxmax(), 'state']
highest_count = state_counts.loc[state_counts['patient_id'].idxmax(), 'patient_id']

# Find the state with the lowest number of patients
lowest_state = state_counts.loc[state_counts['patient_id'].idxmin(), 'state']
lowest_count = state_counts.loc[state_counts['patient_id'].idxmin(), 'patient_id']

# Create a US map figure
fig = px.choropleth(state_counts, locations='state', locationmode='USA-states', 
                    color='patient_id', scope='usa', color_continuous_scale='YlOrRd')

# Customize the figure
fig.update_layout(title='Patient Counts by State',
                  annotations=[
                      dict(
                          x=0.5,
                          y=-0.1,
                          xref='paper',
                          yref='paper',
                          showarrow=False,
                          text=f'Highest count in {highest_state}: {highest_count} patients | Lowest count in {lowest_state}: {lowest_count} patients'
                      )
                  ])

# Show the figure
fig.show()
In [36]:
booked_from_counts = operations['booked_from'].value_counts()
print(booked_from_counts)
grow      43889
zocdoc    21994
Name: booked_from, dtype: int64
In [37]:
import pandas as pd
import matplotlib.pyplot as plt

# Get the value counts for the 'booked_from' column
booked_from_counts = operations['booked_from'].value_counts()

# Create a DataFrame with the counts and the corresponding index values
df = pd.DataFrame({'booked_from': booked_from_counts.index, 'count': booked_from_counts.values})

# Sort the DataFrame by count values in descending order
df = df.sort_values('count', ascending=False)

# Create the lollipop plot
plt.stem(df['booked_from'], df['count'], basefmt=' ')

# Set the x-axis label and title
plt.xlabel('booked_from')
plt.title('Counts of Booked From')

# Show the plot
plt.show()
In [38]:
unique_providers = operations['provider_id'].unique()
print(unique_providers)
[   24    37    38 ... 13182 13203 13267]
In [39]:
# group patients by provider and count number of patients for each provider
provider_counts = operations.groupby('provider_id')['patient_id'].count()

# get provider with highest and lowest patient counts
max_provider = provider_counts.idxmax()
min_provider = provider_counts.idxmin()

print(f"The provider with the most patients is {max_provider} with {provider_counts[max_provider]} patients.")
print(f"The provider with the least patients is {min_provider} with {provider_counts[min_provider]} patients.")
The provider with the most patients is 3244 with 441 patients.
The provider with the least patients is 38 with 1 patients.
In [40]:
provider_patient_counts = operations.groupby('provider_id')['patient_id'].nunique()

print(provider_patient_counts)
provider_id
24       29
37       43
38        1
44       21
45       18
         ..
13148     1
13155     1
13182     5
13203     3
13267     1
Name: patient_id, Length: 3287, dtype: int64
In [41]:
duplicate_counts = operations.duplicated(subset=['provider_id'], keep=False).groupby(operations['provider_id']).sum().astype(int)
print(duplicate_counts)
provider_id
24       29
37       43
38        0
44       21
45       18
         ..
13148     0
13155     0
13182     5
13203     3
13267     0
Length: 3287, dtype: int64
In [42]:
import pandas as pd
import plotly.express as px

# calculate the number of duplicates per provider
duplicate_counts = operations.duplicated(subset=['provider_id'], keep=False).groupby(operations['provider_id']).sum().astype(int)

# sort providers by number of duplicates in descending order and select top 5
top_providers = duplicate_counts.sort_values(ascending=False).head(5)

# create a dataframe with the selected providers and their duplicate counts
df = pd.DataFrame({'Provider ID': top_providers.index, 'Duplicate Count': top_providers.values})

# create a stacked donut chart
fig = px.sunburst(df, path=['Provider ID'], values='Duplicate Count', color_discrete_sequence=px.colors.qualitative.Pastel)

# set chart title
fig.update_layout(title='Top 5 Providers with the Most Duplicates')

# show the chart
fig.show()
In [43]:
provider_duplicate_counts = operations.duplicated(subset=['provider_id'], keep=False).groupby(operations['provider_id']).sum().astype(int)
print(provider_duplicate_counts)
provider_id
24       29
37       43
38        0
44       21
45       18
         ..
13148     0
13155     0
13182     5
13203     3
13267     0
Length: 3287, dtype: int64
In [44]:
import matplotlib.pyplot as plt

# Count number of occurrences for each provider ID
provider_counts = operations['provider_id'].value_counts()

# Get top 10 providers with highest number of occurrences
top_providers = provider_counts.head(10)

# Create pie chart
fig, ax = plt.subplots()
ax.pie(top_providers.values, labels=top_providers.index, autopct='%1.1f%%', startangle=90)

# Set title
ax.set_title('Top 10 Providers with Highest Number of Occurrences')

# Show plot
plt.show()
In [45]:
gender_counts = operations['gender'].value_counts()
most_common_gender = gender_counts.idxmax()
print("The most common gender among patients is:", most_common_gender)
The most common gender among patients is: Female
In [46]:
gender_counts = operations['gender'].value_counts()
print(gender_counts)
Female                44483
Male                  20317
Other                   764
Decline to Specify      312
female                    4
male                      2
Unknown                   1
Name: gender, dtype: int64
In [47]:
operations["gender"] = operations["gender"].replace("female", "Female")
In [48]:
operations["gender"] = operations["gender"].replace("male", "Male")
In [49]:
gender_counts = operations['gender'].value_counts()
print(gender_counts)
Female                44487
Male                  20319
Other                   764
Decline to Specify      312
Unknown                   1
Name: gender, dtype: int64
In [50]:
import matplotlib.pyplot as plt

# Get gender counts
gender_counts = operations['gender'].value_counts()

# Create bar graph
fig, ax = plt.subplots()
ax.bar(gender_counts.index, gender_counts.values)

# Set axis labels and title
ax.set_xlabel('Gender')
ax.set_ylabel('Number of Patients')
ax.set_title('Patient Gender Distribution')

# Display graph
plt.show()
In [51]:
# Convert dob column to timezone-aware datetime object
operations['dob'] = pd.to_datetime(operations['dob'])

# Calculate age by subtracting dob from current date
now = datetime.now()
operations['age'] = (now.date() - operations['dob'].dt.date).astype('<m8[Y]')

# Calculate the average age
avg_age = operations['age'].mean()
print(f"The average age of patients is {avg_age:.2f} years")
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
/var/folders/bl/5kk4pcgd6vqdpfcs3h7lpqn80000gn/T/ipykernel_1283/883925839.py in <module>
      3 
      4 # Calculate age by subtracting dob from current date
----> 5 now = datetime.now()
      6 operations['age'] = (now.date() - operations['dob'].dt.date).astype('<m8[Y]')
      7 

NameError: name 'datetime' is not defined
In [52]:
from datetime import datetime

# Convert dob column to timezone-aware datetime object
operations['dob'] = pd.to_datetime(operations['dob'])

# Calculate age by subtracting dob from current date
now = datetime.now()
operations['age'] = (now.date() - operations['dob'].dt.date).astype('<m8[Y]')

# Calculate the average age
avg_age = operations['age'].mean()
print(f"The average age of patients is {avg_age:.2f} years")

# Calculate the average gender
avg_gender = operations['gender'].mode()[0]
print(f"The average gender of patients is {avg_gender}")
The average age of patients is 31.30 years
The average gender of patients is Female
In [53]:
import plotly.express as px

# Create a copy of the operations dataframe with only age and gender columns
age_gender_df = operations[['age', 'gender']].copy()

# Group the dataframe by age and gender and count the number of patients in each group
age_gender_counts = age_gender_df.groupby(['age', 'gender']).size().reset_index(name='count')

# Plot the bubble chart
fig = px.scatter(age_gender_counts, x='age', y='count', size='count', color='gender', title='Distribution of Age and Gender of Patients')
fig.show()
In [54]:
# Find the minimum and maximum age by gender
age_by_gender = operations.groupby('gender')['age'].agg(['min', 'max'])

# Print the results
print(f"The oldest patient is {age_by_gender.loc[age_by_gender['max'].idxmax()]['max']:.0f} years old and is {age_by_gender['max'].idxmax()}")
print(f"The youngest patient is {age_by_gender.loc[age_by_gender['min'].idxmin()]['min']:.0f} years old and is {age_by_gender['min'].idxmin()}")
The oldest patient is 99 years old and is Female
The youngest patient is 2 years old and is Female
In [55]:
!pip install shap
Requirement already satisfied: shap in ./opt/anaconda3/lib/python3.9/site-packages (0.41.0)
Requirement already satisfied: slicer==0.0.7 in ./opt/anaconda3/lib/python3.9/site-packages (from shap) (0.0.7)
Requirement already satisfied: packaging>20.9 in ./opt/anaconda3/lib/python3.9/site-packages (from shap) (21.0)
Requirement already satisfied: pandas in ./opt/anaconda3/lib/python3.9/site-packages (from shap) (1.3.4)
Requirement already satisfied: numba in ./opt/anaconda3/lib/python3.9/site-packages (from shap) (0.54.1)
Requirement already satisfied: scikit-learn in ./opt/anaconda3/lib/python3.9/site-packages (from shap) (0.24.2)
Requirement already satisfied: scipy in ./opt/anaconda3/lib/python3.9/site-packages (from shap) (1.7.1)
Requirement already satisfied: cloudpickle in ./opt/anaconda3/lib/python3.9/site-packages (from shap) (2.0.0)
Requirement already satisfied: numpy in ./opt/anaconda3/lib/python3.9/site-packages (from shap) (1.20.3)
Requirement already satisfied: tqdm>4.25.0 in ./opt/anaconda3/lib/python3.9/site-packages (from shap) (4.62.3)
Requirement already satisfied: pyparsing>=2.0.2 in ./opt/anaconda3/lib/python3.9/site-packages (from packaging>20.9->shap) (3.0.4)
Requirement already satisfied: setuptools in ./opt/anaconda3/lib/python3.9/site-packages (from numba->shap) (58.0.4)
Requirement already satisfied: llvmlite<0.38,>=0.37.0rc1 in ./opt/anaconda3/lib/python3.9/site-packages (from numba->shap) (0.37.0)
Requirement already satisfied: python-dateutil>=2.7.3 in ./opt/anaconda3/lib/python3.9/site-packages (from pandas->shap) (2.8.2)
Requirement already satisfied: pytz>=2017.3 in ./opt/anaconda3/lib/python3.9/site-packages (from pandas->shap) (2021.3)
Requirement already satisfied: six>=1.5 in ./opt/anaconda3/lib/python3.9/site-packages (from python-dateutil>=2.7.3->pandas->shap) (1.16.0)
Requirement already satisfied: threadpoolctl>=2.0.0 in ./opt/anaconda3/lib/python3.9/site-packages (from scikit-learn->shap) (2.2.0)
Requirement already satisfied: joblib>=0.11 in ./opt/anaconda3/lib/python3.9/site-packages (from scikit-learn->shap) (1.1.0)
In [56]:
import shap
In [57]:
import matplotlib.pyplot as plt

# Data
count_utm_medium = {'api': 21971, 'organic': 15911, 'email': 9596, 'cpc': 7590, 'text': 4916, 'call': 2480, 'booking-link': 1719, 'switch': 1177, 'profile-link': 499, 'affliate': 18, 'Yext': 2, 'form': 1, 'email for scheduling.': 1, 'referral': 1, "email'": 1}

# Define threshold
threshold = 2.5

# Get total count
total_count = sum(count_utm_medium.values())

# Compute percentages and filter labels
labels = []
sizes = []
colors = ['#aebfff', '#80df56', '#009E73', '#F0E442', '#0072B2', '#00eacc', '#769d23']
for label, count in count_utm_medium.items():
    percent = 100 * count / total_count
    if percent >= threshold:
        labels.append(f"{label}\n({percent:.1f}%)")
        sizes.append(count)
        if label == 'api':
            colors.append('tab:blue')
        elif label == 'organic':
            colors.append('tab:orange')
        elif label == 'email':
            colors.append('tab:green')
        else:
            colors.append('tab:gray')

# Set up the plot
fig, ax = plt.subplots()
ax.axis('equal')

# Draw the pie chart
wedges, texts, _ = ax.pie(sizes, wedgeprops=dict(width=0.5), startangle=-40, autopct='', colors=colors)

# Set the labels and add a legend
ax.legend(wedges, labels, loc="best", bbox_to_anchor=(0.9, 0.9))

# Set the title
ax.set_title("UTM Medium Distribution")
plt.show()
In [58]:
import matplotlib.pyplot as plt

# Data
count_utm_medium = {'api': 21971, 'organic': 15911, 'email': 9596, 'cpc': 7590, 'text': 4916, 'call': 2480, 'booking-link': 1719, 'switch': 1177, 'profile-link': 499, 'affliate': 18, 'Yext': 2, 'form': 1, 'email for scheduling.': 1, 'referral': 1, "email'": 1}

# Define threshold
threshold = 2.5

# Get total count
total_count = sum(count_utm_medium.values())

# Compute percentages and filter labels
labels = []
sizes = []
colors = ['#aebfff', '#80df56', '#009E73', '#F0E442', '#0072B2', '#00eacc', '#769d23']
for label, count in count_utm_medium.items():
    percent = 100 * count / total_count
    if percent >= threshold:
        labels.append(f"{label}\n({percent:.1f}%)")
        sizes.append(count)
        if label == 'api':
            colors.append('tab:blue')
        elif label == 'organic':
            colors.append('tab:orange')
        elif label == 'email':
            colors.append('tab:green')
        else:
            colors.append('tab:gray')

# Set up the plot
fig, ax = plt.subplots()
ax.axis('equal')

# Draw the pie chart
wedges, texts, _ = ax.pie(sizes, wedgeprops=dict(width=0.5), startangle=-40, autopct='', colors=colors)

# Set the labels and add a legend
ax.legend(wedges, labels, loc="best", bbox_to_anchor=(0.9, 0.9))


# Set the title
ax.set_title("UTM Medium Distribution")

# Set the title position and rotation
ax.title.set_position([1.05, 0.5])
ax.title.set_rotation(90)

plt.show()
In [ ]: